lab11

Author

Vanessa Ramos

Step 1- read in the data

library(readr)
library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(zoo)

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
## data extracted from New York Times state-level data from NYT Github repository
# https://github.com/nytimes/covid-19-data

## state-level population information from us_census_data available on GitHub repository:
# https://github.com/COVID19Tracking/associated-data/tree/master/us_census_data

### FINISH THE CODE HERE ###
# load COVID state-level data from NYT
cv_states <- as.data.frame(read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))

### FINISH THE CODE HERE ###
# load state population data
state_pops <- as.data.frame(read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
str(state_pops)
'data.frame':   52 obs. of  5 variables:
 $ state      : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ geo_id     : int  1 2 4 5 6 8 9 10 11 12 ...
 $ population : int  4887871 737438 7171646 3013825 39557045 5695564 3572665 967171 702455 21299325 ...
 $ pop_density: num  96.51 1.29 63.14 57.92 253.91 ...
 $ abb        : chr  "AL" "AK" "AZ" "AR" ...
str(cv_states)
'data.frame':   61942 obs. of  5 variables:
 $ date  : chr  "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" ...
 $ state : chr  "Washington" "Washington" "Washington" "Illinois" ...
 $ fips  : int  53 53 53 17 53 6 17 53 4 6 ...
 $ cases : int  1 1 1 1 1 1 1 1 1 2 ...
 $ deaths: int  0 0 0 0 0 0 0 0 0 0 ...
### FINISH THE CODE HERE
cv_states <- merge(
  # Data
  x     = cv_states,      
  y     = state_pops,
  # List of variables to match
  by.x  = "state",
  by.y  = "state", 
  # Which obs to keep?
  all.x = TRUE,      
  all.y = TRUE
  )

Step 2: Look at the data

The data appears to be in correct format.

dim(cv_states)
[1] 61942     9
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04    1 1587224  21263      1    4887871    96.50939  AL
2 Alabama 2020-04-25    1    6213    213      1    4887871    96.50939  AL
3 Alabama 2023-02-26    1 1638348  21400      1    4887871    96.50939  AL
4 Alabama 2022-12-03    1 1549285  21129      1    4887871    96.50939  AL
5 Alabama 2020-05-06    1    8691    343      1    4887871    96.50939  AL
6 Alabama 2021-04-21    1  524367  10807      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
61937 Wyoming 2022-09-11   56 175290   1884     56     577737    5.950611  WY
61938 Wyoming 2022-08-21   56 173487   1871     56     577737    5.950611  WY
61939 Wyoming 2021-01-26   56  51152    596     56     577737    5.950611  WY
61940 Wyoming 2021-02-21   56  53795    662     56     577737    5.950611  WY
61941 Wyoming 2021-08-22   56  70671    809     56     577737    5.950611  WY
61942 Wyoming 2021-03-20   56  55581    693     56     577737    5.950611  WY
str(cv_states)
'data.frame':   61942 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : chr  "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
 $ deaths     : int  21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...

Step 3: Format the data

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")

# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)

### FINISH THE CODE HERE 
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
61750 Wyoming 2023-03-18   56 185640   2009     56     577737    5.950611  WY
61764 Wyoming 2023-03-19   56 185640   2009     56     577737    5.950611  WY
61495 Wyoming 2023-03-20   56 185640   2009     56     577737    5.950611  WY
61715 Wyoming 2023-03-21   56 185800   2014     56     577737    5.950611  WY
61905 Wyoming 2023-03-22   56 185800   2014     56     577737    5.950611  WY
61660 Wyoming 2023-03-23   56 185800   2014     56     577737    5.950611  WY

Based on observations of the dataset structure, I can confirm that the variables are correctly formatted.

# Inspect the range values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
597  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
282  Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
12   Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
266  Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
78   Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
           state            date                 fips           cases         
 Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
 Illinois     : 1155   1st Qu.:2020-12-09   1st Qu.:17.00   1st Qu.:   70784  
 California   : 1154   Median :2021-09-16   Median :31.00   Median :  351496  
 Arizona      : 1153   Mean   :2021-09-13   Mean   :32.25   Mean   :  889830  
 Massachusetts: 1147   3rd Qu.:2022-06-20   3rd Qu.:46.00   3rd Qu.: 1043231  
 Wisconsin    : 1143   Max.   :2023-03-23   Max.   :78.00   Max.   :12169158  
 (Other)      :55032                                                          
     deaths           geo_id        population        pop_density       
 Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
 1st Qu.:  1171   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
 Median :  5035   Median :29.00   Median : 4468402   Median :  107.860  
 Mean   : 11779   Mean   :29.78   Mean   : 6397965   Mean   :  423.031  
 3rd Qu.: 14912   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
 Max.   :104277   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
                  NA's   :3848    NA's   :3848       NA's   :4954       
      abb       
 WA     : 1158  
 IL     : 1155  
 CA     : 1154  
 AZ     : 1153  
 MA     : 1147  
 (Other):52327  
 NA's   : 3848  
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

The range of variables are listed in the following codes:

diff(range(cv_states$date))
Time difference of 1157 days
diff(range(cv_states$cases))
[1] 12169157
diff(range(cv_states$deaths))
[1] 104277

Step 4: Add new_cases and new_deaths and correct outliers

# Add variables for new_cases and new_deaths:
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]
}
  # add starting level for new cases and deaths
  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]

  ### FINISH THE CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$new_cases[j-1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$new_deaths[j-1]
  }

  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths


# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")
### FINISH THE CODE HERE
# Inspect outliers in new_cases using plotly
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() +
  geom_point(size = .5, alpha = 0.5)
ggplotly(p1) 
p1
Warning: Removed 36242 rows containing missing values (`geom_line()`).
Warning: Removed 36242 rows containing missing values (`geom_point()`).

#p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_smooth() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 36242 rows containing non-finite values (`stat_smooth()`).
p2<-NULL # to clear from workspace

# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0

# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])

  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]
}
 ### FINISH CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j]
  
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}

# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)

# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
#p2=NULL

Step 5: Add additional variables

### FINISH CODE HERE
# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
Warning: NAs introduced by coercion
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))

# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date==max(cv_states$date))

Section 2: Scatterplots

Step 6: Explore scatterplots using plot_ly()

### FINISH CODE HERE

# pop_density vs. cases
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 5 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")

# pop_density vs. cases after filtering
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
Warning: Ignoring 5 observations

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# pop_density vs. deathsper100k
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
No trace type specified:
  Based on info supplied, a 'scatter' trace seems appropriate.
  Read more about this trace type -> https://plotly.com/r/reference/#scatter
No scatter mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Warning: Ignoring 5 observations
Warning: `line.width` does not currently support multiple values.
# Adding hoverinfo
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste( paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep="") , 
                         paste(" Deaths per 100k: ", per100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
                  yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")
Warning: Ignoring 5 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Step 7: Explore scatterplot trend interactively using ggplotly() and geom_smooth()

### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_line()
ggplotly(p)

There seems to be a positive relationship with population density and deaths (per 100k)

Step 8: Multiple line chart

### FINISH CODE HERE
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

After approximately Jan 2022, most states seem to level out in naive_CFR

### FINISH CODE HERE
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~cases, type = "scatter", mode = "lines") %>% add_trace(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines") 

** used chatgpt to find the add_trace function

Step 9: Heatmaps

### FINISH CODE HERE
# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = date))
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} %>%
  dplyr::group_by(new_cases, state) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2021-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = date))
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} %>%
  dplyr::group_by(newper100k, state) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by="weeks")

cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = date))
Warning: Values from `date` are not uniquely identified; output will contain list-cols.
• Use `values_fn = list` to suppress this warning.
• Use `values_fn = {summary_fun}` to summarise duplicates.
• Use the following dplyr code to identify duplicates.
  {data} %>%
  dplyr::group_by(newper100k, state) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L)
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)

I was stuck on this code so could not figure out which states stand out the most.

Step 10: Map

### For specified date

pick.date = "2021-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% filter(date==pick.date) %>% select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Make sure both maps are on the same color scale
shadeLimit <- 125

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", pick.date), limits = c(0,shadeLimit))
Warning: Ignoring 55 observations
fig <- fig %>% layout(
    title = paste('Cases per 100k by State as of ', pick.date, '<br>(Hover for value)'),
    geo = set_map_details
  )
fig_pick.date <- fig

#############
### Map for today's date

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%  select(state, abb, newper100k, cases, deaths) # select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, paste(state_name, '<br>', "Cases per 100k: ", newper100k, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, text = ~hover, locations = ~state,
    color = ~newper100k, colors = 'Purples'
  )
fig <- fig %>% colorbar(title = paste0("Cases per 100k: ", Sys.Date()), limits = c(0,shadeLimit))
Warning: Ignoring 55 observations
fig <- fig %>% layout(
    title = paste('Cases per 100k by State as of', Sys.Date(), '<br>(Hover for value)'),
    geo = set_map_details
  )
fig_Today <- fig


### Plot together 
subplot(fig_pick.date, fig_Today, nrows = 2, margin = .05)

I did not do this code correctly, so I cannot compare the two maps.